Opera 18 days REIMAGED - Preprocessing QC statistics ¶

Aug 2025¶

In [4]:
import os
NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = os.path.join(NOVA_HOME, 'input')
LOGS_PATH = os.path.join(NOVA_HOME, "outputs/preprocessing/ManuscriptFinalData_80pct/neuronsDay18/logs/")
PLOT_PATH = os.path.join(NOVA_HOME, "outputs/preprocessing/ManuscriptFinalData_80pct/neuronsDay18/logs/plots")

os.chdir(NOVA_HOME)
import pandas as pd
import contextlib
import io
from IPython.display import display, Javascript

from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
                                                show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
                                                show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
                                                calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
                                                plot_cell_count, plot_catplot, plot_hm_of_mean_cell_count_per_tile, \
                                                run_calc_hist_new
                                                
from tools.preprocessing_tools.qc_reports.qc_config import opera18days_panels, opera18days_markers, opera18days_marker_info, \
                                                opera18days_cell_lines, opera18days_cell_lines_to_cond,\
                                                opera18days_cell_lines_for_disp, opera18days_reps, \
                                                opera18days_line_colors, opera18days_lines_order, \
                                                opera18days_custom_palette, opera18days_expected_dapi_raw, \
                                                markers
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [5]:
df = log_files_qc(LOGS_PATH,only_wt_cond=False, filename_split='-',site_location=0)
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch2
reading logs of batch1

Total of 4 files were read.
Before dup handeling  (131158, 21)
After duplication removal #1: (120135, 22)
After duplication removal #2: (120135, 22)
In [6]:
# choose batches
batches = ['batch1', 'batch2']
batches
Out[6]:
['batch1', 'batch2']

Actual Files Validation¶

Raw Files Validation¶

  1. How many site tiff files do we have in each folder?
  2. Are all existing files valid? (tif or tiff, at least 1MB, not corrupetd)
In [7]:
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'images', 'raw', 'Opera18DaysReimaged_sorted')

batches_raw = [batch.replace("_16bit_no_downsample","") for batch in batches]
raws = run_validate_folder_structure(root_directory_raw, False, opera18days_panels, opera18days_markers.copy(),PLOT_PATH, opera18days_marker_info,
                                    opera18days_cell_lines_to_cond, opera18days_reps, opera18days_cell_lines_for_disp, opera18days_expected_dapi_raw,
                                     batches=batches_raw, fig_height=12)
batch1
Folder structure is valid.
No bad files are found.
Total Sites:  82000
========
batch2
Folder structure is invalid. Missing 2 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/Opera18DaysReimaged_sorted/batch2/FUSHomozygous/panelD/stress/rep1/CLTC
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/raw/Opera18DaysReimaged_sorted/batch2/FUSHomozygous/panelD/stress/rep2/CLTC
No bad files are found.
Total Sites:  81800
========
====================

Processed Files Validation¶

  1. How many site npy files do we have in each folder? -> How many sites survived the pre-processing?
  2. Are all existing files valid? (at least 100kB, npy not corrupted)
In [8]:
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'images', 'processed', 'ManuscriptFinalData_80pct', 'neuronsDay18')
procs = run_validate_folder_structure(root_directory_proc, True, opera18days_panels, opera18days_markers,PLOT_PATH,opera18days_marker_info,
                                    opera18days_cell_lines_to_cond, opera18days_reps, opera18days_cell_lines_for_disp, opera18days_expected_dapi_raw,
                                     batches=batches, fig_height=12)
batch1
Folder structure is invalid. Missing 10 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/WT/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/WT/stress/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/FUSHomozygous/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/FUSHomozygous/stress/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/FUSHeterozygous/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/TBK1/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/TDP43/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/FUSRevertant/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/OPTN/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch1/SNCA/Untreated/CD41
No bad files are found.
Total Sites:  54596
========
batch2
Folder structure is invalid. Missing 15 paths:
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/WT/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/WT/stress/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHomozygous/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHomozygous/stress/CLTC
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHomozygous/stress/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHeterozygous/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/TBK1/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/TDP43/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSRevertant/Untreated/SQSTM1
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSRevertant/Untreated/FMRP
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSRevertant/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSRevertant/Untreated/Phalloidin
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/OPTN/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/SNCA/Untreated/CD41
/home/projects/hornsteinlab/Collaboration/NOVA/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/SNCA/Untreated/Tubulin
No bad files are found.
Total Sites:  52681
========
====================

Difference between Raw and Processed¶

In [9]:
display_diff(batches, raws, procs, PLOT_PATH,fig_height=12)
batch1
========
batch2
========

Variance in each batch (of processed files)¶

In [10]:
#for batch in list(range(3,9)) + ['7_16bit','8_16bit','9_16bit']:  

for batch in batches:
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(root_directory_proc, batch, 
                                       sample_size_per_markers=200, cond_count=2, rep_count=len(opera18days_reps), 
                                       num_markers=len(opera18days_markers))
    print(f'{batch} var: ',var)
batch1 var:  0.041773310225922025
batch2 var:  0.040858276841131386

Preprocessing Filtering qc¶

By order of filtering

1. % site survival after Brenner on DAPI channel¶

Percentage out of the total sites

In [11]:
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, opera18days_line_colors, opera18days_panels,
                                                         figsize=(10,6), reps = opera18days_reps)

2. % Site survival after Cellpose¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if Cellpose found 0 cells in it.

In [12]:
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner, 
                                                           opera18days_line_colors, opera18days_panels, reps = opera18days_reps,
                                                           figsize=(10,6))

3. % Site survival by tiling¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.

In [13]:
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose, 
                                                     opera18days_line_colors, opera18days_panels, figsize=(10,6),
                                                     reps = opera18days_reps)

4. % Site survival after Brenner on target channel¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).

In [14]:
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling,
                                 figsize=(10,10), markers=opera18days_markers)

Statistics About the Processed Files¶

In [15]:
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, opera18days_markers)

Total tiles¶

In [16]:
## Are we using FMRP?
markers_for_d18 = markers.copy()
markers_for_d18.remove('TIA1')
total_sum[total_sum.marker.isin(markers_for_d18)].n_valid_tiles.sum()
Out[16]:
318137

Total whole nuclei in tiles¶

In [17]:
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
Out[17]:
78482.0

Total nuclei in sites¶

In [18]:
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
Out[18]:
282929.0
In [19]:
show_total_sum_tables(total_sum)
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch1
count 800.000000 800.000000 800.000000 800.000000
mean 238.811250 2.388112 166.008750 580.743750
std 139.461294 1.394613 152.195517 392.834608
min 1.000000 0.010000 1.000000 2.000000
25% 132.000000 1.320000 93.000000 341.000000
50% 241.000000 2.410000 149.000000 563.000000
75% 332.250000 3.322500 208.000000 779.250000
max 583.000000 5.830000 2122.000000 4721.000000
sum 191049.000000 NaN 132807.000000 464595.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch2
count 792.000000 792.000000 792.000000 792.000000
mean 225.335859 2.253359 133.366162 509.843434
std 133.733052 1.337331 82.621522 298.209843
min 0.000000 0.000000 0.000000 1.000000
25% 122.750000 1.227500 72.750000 283.000000
50% 203.000000 2.030000 118.000000 465.000000
75% 323.250000 3.232500 185.250000 724.000000
max 662.000000 6.620000 648.000000 1917.000000
sum 178466.000000 NaN 105626.000000 403796.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n valid tiles % valid tiles site_whole_cells_counts_sum site_cell_count
All batches
count 1592.000000 1592.000000 1592.000000 1592.000000
mean 232.107412 2.321074 149.769472 545.471734
std 136.764821 1.367648 123.665234 350.670068
min 0.000000 0.000000 0.000000 1.000000
25% 124.000000 1.240000 80.000000 307.750000
50% 226.000000 2.260000 134.000000 517.500000
75% 330.000000 3.300000 200.000000 758.000000
max 662.000000 6.620000 2122.000000 4721.000000
sum 369515.000000 NaN 238433.000000 868391.000000
expected_count 450.000000 450.000000 450.000000 450.000000

Show Total Tile Counts¶

For each batch, cell line, replicate and markerTotal number of tiles

In [20]:
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count_sum', 
                title='Cell Count Average per Site (from tiles)')

plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_whole_cells_counts_sum',
                title='Whole Cell Count Average per Site')

plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count',
               title='Cellpose Cell Count Average per Site')

Show Cell Count Statistics per Batch¶

In [21]:
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count_sum', 
                title='Cell Count Average per Site (from tiles)')

plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_whole_cells_counts_sum',
                title='Whole Cell Count Average per Site')

plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count',
               title='Cellpose Cell Count Average per Site')

Show Tiles per Site Statistics¶

In [22]:
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
Out[22]:
cell_line_cond
FUSHeterozygous Untreated    2.924984
FUSHomozygous Untreated      3.255590
FUSHomozygous stress         2.503102
FUSRevertant Untreated       1.609745
OPTN Untreated               2.483955
SNCA Untreated               1.326592
TBK1 Untreated               1.895484
TDP43 Untreated              2.161961
WT Untreated                 3.902980
WT stress                    2.918544
Name: n_valid_tiles, dtype: float64
In [23]:
df_dapi[['site_cell_count']].mean()
Out[23]:
site_cell_count    6.211258
dtype: float64
In [24]:
plot_catplot(df_dapi, opera18days_custom_palette, opera18days_reps, 
             x='n_valid_tiles', x_title='valid tiles count', batch_min=1, batch_max=2)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1061: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']

Show Mean of cell count in valid tiles¶

In [25]:
plot_hm_of_mean_cell_count_per_tile(df_dapi, split_by='rep', rows='cell_line', columns='panel')

Assessing Staining Reproducibility and Outliers¶

In [ ]:
for batch in batches:
    print(batch)
    run_calc_hist_new(f'{batch}',opera18days_cell_lines_for_disp, opera18days_markers, 
                           root_directory_raw, root_directory_proc, hist_sample=10,
                            sample_size_per_markers=200, ncols=7, nrows=5)
    print("="*30)
In [ ]:
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html tools/preprocessing_tools/qc_reports/qc_report_d18_Opera_80pct.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/qc_report_d18_Opera_80pct.html')
In [ ]: